{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# *Putting the sorting hat on J.K. Rowling’s reader. A digital inquiry into the age of the implied readership of the Harry Potter series*\n",
    "##### (Code accompanying the article)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dependencies"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "\n",
    "import spacy\n",
    "\n",
    "import benepar\n",
    "from benepar.spacy_plugin import BeneparComponent\n",
    "\n",
    "\n",
    "nlp = spacy.load(\"en_core_web_sm\")\n",
    "\n",
    "## Uncomment lines below if you want to use Berkeley Neural Parser (Benepar)\n",
    "## See: https://pypi.org/project/benepar/\n",
    "\n",
    "#benepar.download('benepar_en2')\n",
    "#nlp.add_pipe(BeneparComponent('benepar_en2'))\n",
    "\n",
    "## Make sure SpaCy can handle bigger files/longer texts (that take up more RAM)\n",
    "nlp.max_length = 13000000 "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Function for exploding pandas DataFrames (i.e. cells that contain lists are expanded so that each value from the list gets assigned to a separate row)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def explode(df, lst_cols, fill_value=''):\n",
    "    # make sure `lst_cols` is a list\n",
    "    if lst_cols and not isinstance(lst_cols, list):\n",
    "        lst_cols = [lst_cols]\n",
    "    # all columns except `lst_cols`\n",
    "    idx_cols = df.columns.difference(lst_cols)\n",
    "\n",
    "    # calculate lengths of lists\n",
    "    lens = df[lst_cols[0]].str.len()\n",
    "\n",
    "    if (lens > 0).all():\n",
    "        # ALL lists in cells aren't empty\n",
    "        return pd.DataFrame({\n",
    "            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())\n",
    "            for col in idx_cols\n",
    "        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \\\n",
    "          .loc[:, df.columns]\n",
    "    else:\n",
    "        # at least one list in cells is empty\n",
    "        return pd.DataFrame({\n",
    "            col:np.repeat(df[col].values, df[lst_cols[0]].str.len())\n",
    "            for col in idx_cols\n",
    "        }).assign(**{col:np.concatenate(df[col].values) for col in lst_cols}) \\\n",
    "          .append(df.loc[lens==0, idx_cols]).fillna(fill_value) \\\n",
    "          .loc[:, df.columns]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Setting up the files to be analyzed"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Corpus definition. Define the corpus folder and explicitly provide the names of the txt-files you want to scrutinize."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Folder that holds the corpus; every entry of `texts_uk` is read as f'{corpus_folder}/{name}.txt'\n",
    "corpus_folder = '../HP1_annotated'\n",
    "\n",
    "## The four lists below are parallel (they are zipped together in the analysis loops),\n",
    "## so position i of each list must describe the same book\n",
    "texts_uk = ['uk1', 'uk2', 'uk3', 'uk4', 'uk5', 'uk6', 'uk7']\n",
    "hps = ['HP1', 'HP2', 'HP3', 'HP4', 'HP5', 'HP6', 'HP7']\n",
    "titles = ['The Philosopher\\'s Stone', 'The Chamber of Secrets', 'The Prisoner of Azkaban', 'The Goblet of Fire', 'The Order of the Phoenix', 'The Half-Blood Prince', 'The Deathly Hallows']\n",
    "chronology = [1, 2, 3, 4, 5, 6, 7]\n",
    "\n",
    "## Uncomment lines below if you want to expand the analysis to Rowling's adult novels\n",
    "\n",
    "#texts_uk = ['uk1', 'uk2', 'uk3', 'uk4', 'uk5', 'uk6', 'uk7', 'ROWLING_thecasualvacancy_2012', 'ROWLING_thecuckooscalling_2013', 'ROWLING_thesilkworm_2014', 'ROWLING_careerofevil_2015', 'ROWLING_lethalwhite_2018']\n",
    "#hps = ['HP1', 'HP2', 'HP3', 'HP4', 'HP5', 'HP6', 'HP7', 'The Casual Vacancy', 'The Cuckoo\\'s Calling', 'The Silkworm', 'Career of Evil', 'Lethal White']\n",
    "#titles = ['The Philosopher\\'s Stone', 'The Chamber of Secrets', 'The Prisoner of Azkaban', 'The Goblet of Fire', 'The Order of the Phoenix', 'The Half-Blood Prince', 'The Deathly Hallows', 'The Casual Vacancy', 'The Cuckoo\\'s Calling', 'The Silkworm', 'Career of Evil', 'Lethal White']\n",
    "#chronology = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 1. Characters, sentences, tokens, type-to-token-ratio (TTR) etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## One record (dict) per book; turned into a DataFrame in the next cell\n",
    "df_counts = []\n",
    "\n",
    "for text, chron, hp, title in zip(texts_uk, chronology, hps, titles):\n",
    "    print(f'Tokenizing {title}...')\n",
    "\n",
    "    # Read text and close the file again before the (slow) NLP work starts\n",
    "    # NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm\n",
    "    with open(f'{corpus_folder}/{text}.txt', 'r', encoding='utf-8') as file:\n",
    "        raw_text = file.read()\n",
    "\n",
    "    # Convert to SpaCy nlp-object\n",
    "    doc = nlp(raw_text)\n",
    "\n",
    "    # Tokens\n",
    "    tokens = [token.text for token in doc]\n",
    "\n",
    "    # Remove punctuation from tokens\n",
    "    tokens_no_punct = [token for token in tokens if token.isalpha()]\n",
    "\n",
    "    # Lowercase tokens (no punctuation)\n",
    "    lowered_tokens = [t.lower() for t in tokens_no_punct]\n",
    "\n",
    "    # Unique types\n",
    "    types = set(lowered_tokens)\n",
    "\n",
    "    # Calculate type-to-token ratio (TTR)\n",
    "    ttr = len(types) / len(lowered_tokens)  # unique lowered tokens / total number of lowered tokens\n",
    "\n",
    "    # Sentences (`Span.text` instead of `Span.string`, which spaCy 3 no longer provides)\n",
    "    sentences = [sent.text.strip() for sent in doc.sents]\n",
    "\n",
    "    # Average sentence length\n",
    "    avg_sent_length = len(tokens_no_punct) / len(sentences)\n",
    "    print(avg_sent_length)\n",
    "\n",
    "    print('\\tTokenizing sentences...')\n",
    "    # Only the tokens of each sentence are needed, so run the tokenizer alone\n",
    "    # (`nlp.make_doc`) instead of the whole pipeline once more per sentence\n",
    "    sentence_lengths = [\n",
    "        sum(1 for token in nlp.make_doc(sentence) if token.text.isalpha())\n",
    "        for sentence in sentences\n",
    "    ]\n",
    "\n",
    "    df_counts.append({'File': text,\n",
    "                      'Title': title,\n",
    "                      'HP': hp,\n",
    "                      'Chronology': chron,\n",
    "                      'Characters': len(raw_text),\n",
    "                      'Sentences': len(sentences),\n",
    "                      'Average sent length': avg_sent_length,\n",
    "                      'Tokens': len(tokens),\n",
    "                      'Tokens no punct': len(tokens_no_punct),\n",
    "                      'Types': len(types),\n",
    "                      'Type-to-token ratio': ttr,\n",
    "                      'Sentences text': sentences,\n",
    "                      'Sentence lengths': sentence_lengths})"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Turn the per-book records collected in the loop above into one DataFrame (one row per book)\n",
    "df_counts = pd.DataFrame(df_counts)\n",
    "df_counts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "import pprint\n",
    "\n",
    "potter_files = ['uk1', 'uk2', 'uk3', 'uk4', 'uk5', 'uk6', 'uk7']\n",
    "rowl_adult_files = ['ROWLING_thecasualvacancy_2012', 'ROWLING_thecuckooscalling_2013', 'ROWLING_thesilkworm_2014', 'ROWLING_careerofevil_2015', 'ROWLING_lethalwhite_2018']\n",
    "\n",
    "\n",
    "def report_sentence_length(books, mean_label, std_label):\n",
    "    \"\"\"Print mean and sample standard deviation of 'Average sent length' for `books`.\n",
    "\n",
    "    `books` is a (sub)frame of `df_counts`; each label is printed in front of its\n",
    "    value. Returns the standard deviation, or NaN when `books` is empty (e.g. the\n",
    "    adult novels are not part of the analysed corpus).\n",
    "    \"\"\"\n",
    "    if books.empty:\n",
    "        print(mean_label, 'n/a (no such books in df_counts)')\n",
    "        return float('nan')\n",
    "    print(mean_label, books['Average sent length'].mean())\n",
    "    standard_deviation = stats.tstd(books['Average sent length'])\n",
    "    print(std_label, standard_deviation)\n",
    "    return standard_deviation\n",
    "\n",
    "\n",
    "# average sentence lengths\n",
    "df_counts_potter = df_counts.loc[df_counts['File'].isin(potter_files)]\n",
    "standard_deviation_potter = report_sentence_length(\n",
    "    df_counts_potter,\n",
    "    'Average total sentence length of Potter series',\n",
    "    'Standard dev of Potter-books:')\n",
    "\n",
    "print('---------------')\n",
    "\n",
    "# empty unless the adult novels were added to the corpus definition\n",
    "df_counts_rowl_adult = df_counts.loc[df_counts['File'].isin(rowl_adult_files)]\n",
    "standard_deviation_rowl_adult = report_sentence_length(\n",
    "    df_counts_rowl_adult,\n",
    "    'Average total sentence length of Rowlings adult books',\n",
    "    'Standard dev of Rowlins adult-books:')\n",
    "print('---------------')\n",
    "\n",
    "standard_deviation_all = report_sentence_length(\n",
    "    df_counts,\n",
    "    'Average total sentence length (all Rowling books)',\n",
    "    'Standard dev of all books:')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## One row per sentence: every entry of the 'Sentence lengths' lists gets its own row\n",
    "df_sent_counts_exploded = df_counts.explode('Sentence lengths')\n",
    "\n",
    "## `DataFrame.explode` leaves the exploded column with dtype object, so the lengths are converted to integers here\n",
    "df_sent_counts_exploded[\"Sentence lengths\"]=df_sent_counts_exploded[\"Sentence lengths\"].astype(int)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Correlation between sentence length (x) and position of the book in the series (y)\n",
    "pp = pprint.PrettyPrinter(indent=4)\n",
    "\n",
    "x = list(df_sent_counts_exploded['Sentence lengths'])\n",
    "y = list(df_sent_counts_exploded['Chronology'])\n",
    "\n",
    "print('===================PEARSON\\'s R===================')\n",
    "## Fit once and reuse the result for both the unpacked values and the printout\n",
    "## (x and y hold one entry per sentence, so fitting twice is wasted work)\n",
    "linregress_result = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = linregress_result\n",
    "print(linregress_result)\n",
    "\n",
    "print('===================TAU===================')\n",
    "tau, p_value_tau = stats.kendalltau(x, y)\n",
    "print('Tau:', tau)\n",
    "print('P-value for Tau:', p_value_tau)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "sns.set(style=\"whitegrid\")\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "mean=df_sent_counts_exploded['Sentence lengths'].mean()\n",
    "\n",
    "## Box plot of the sentence lengths per book (outliers hidden)\n",
    "plt.figure(figsize=(17, 10))\n",
    "## x and y are passed as keywords: current seaborn releases no longer accept them positionally\n",
    "## NOTE(review): `meanprops` only takes effect together with `showmeans=True` -- confirm whether mean markers were intended\n",
    "chart = sns.boxplot(x='HP', y='Sentence lengths', data=df_sent_counts_exploded, showfliers=False, meanprops={\"linestyle\":\"--\", \"linewidth\":4, \"color\":'white'})\n",
    "chart.set_xticklabels(chart.get_xticklabels(), rotation=15)\n",
    "chart.set(xlabel=None, ylabel='Sentence length')\n",
    "#plt.savefig(\"gfx/boxplot_sentence_lengths.svg\", dpi=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import altair as alt\n",
    "\n",
    "## Bar chart of the average sentence length per book (one bar per 'HP' label)\n",
    "base = alt.Chart(df_counts).properties(width=500, height=200)\n",
    "\n",
    "## The y-axis domain is fixed to 9.5-13.5\n",
    "line_sent = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('Average sent length', sort=[\"HP\"], title=['Average sentence length', '(punctuation discarded)'], scale=alt.Scale(domain=(9.5, 13.5)))).configure_axisX(labelAngle=0)\n",
    "\n",
    "## Add grid lines\n",
    "line_sent = line_sent.configure_axis(grid=True)\n",
    "\n",
    "#line_sent.save('../../output/lexicaldiversity/sentence_length.png')\n",
    "\n",
    "line_sent"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 2. Lexical diversity (i.e. ratio of unique words to total words)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "*Lexical diversity (or: richness) of a text can be calculated by taking the ratio of unique word types to the total number of tokens (punctuation excluded), i.e. types / tokens. Lexical diversity refers to “the range of different words used in a text, with a greater range indicating a higher diversity”.*\n",
    "-- McCarthy and Jarvis 2010: 381\n",
    "\n",
    "**McCarthy**, P.M., **Jarvis**, S. MTLD, vocd-D, and HD-D: A validation study of sophisticated approaches to lexical diversity assessment. Behavior Research Methods 42, 381–392 (2010). Online: https://link.springer.com/article/10.3758/BRM.42.2.381"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Scatter plot: one point per book, number of tokens (x) against binned type-to-token ratio (y)\n",
    "chart = alt.Chart(df_counts).mark_circle().encode(\n",
    "    alt.X('Tokens no punct:O', title='Number of tokens (without punctuation)'),\n",
    "    alt.Y('Type-to-token ratio:Q', title='Type-to-token ratio', bin=True))\n",
    "\n",
    "## Label every point with its 'HP' value\n",
    "text = chart.mark_text(\n",
    "    align='left',\n",
    "    baseline='middle',\n",
    "    dx=4  # Nudges text to right so it doesn't appear on top of the point\n",
    ").encode(\n",
    "    text='HP:N')\n",
    "\n",
    "(chart + text).properties(height=350, width=500).configure_axisX(labelAngle=0)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The **points in the graph approximate a straight line** that goes from the bottom left corner to the top right corner. This raises a question: is it true that the \"lengthier\" a novel is, the more diverse its lexicon? In other words, is lexical diversity proportional to story length?\n",
    "\n",
    "To find out, we try to find the straight line that best fits the different points in the graph. To do this, we perform **linear regression**."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from altair import RegressionTransform\n",
    "\n",
    "## Scatter plot: number of tokens (x) against type-to-token ratio (y), one point per book\n",
    "chart = alt.Chart(df_counts).mark_circle().encode(\n",
    "    alt.X('Tokens no punct', title='Number of tokens (without punctuation)', scale=alt.Scale(zero=False)),\n",
    "    alt.Y('Type-to-token ratio', scale=alt.Scale(zero=False))\n",
    ")\n",
    "\n",
    "text = chart.mark_text(\n",
    "    align='left',\n",
    "    baseline='middle',\n",
    "    dx=5  # Nudges text to right so it doesn't appear on top of the point\n",
    ").encode(\n",
    "    text='HP:N')\n",
    "\n",
    "## `transform_regression(on, regression)`: `on` is the independent variable (x-axis),\n",
    "## `regression` the dependent one (y-axis). The arguments used to be swapped, which drew\n",
    "## the fit of the token count on the TTR instead of the fit of the TTR on the token count.\n",
    "regression_line = chart.transform_regression('Tokens no punct', 'Type-to-token ratio').mark_line(color='#D35400')\n",
    "\n",
    "ttr_lin_regr = (regression_line + (chart + text)).properties(height=350, width=500)\n",
    "#ttr_lin_regr.save('../../output/lexicaldiversity/ttr_lin_regr.svg')\n",
    "#ttr_lin_regr.save('../../output/lexicaldiversity/ttr_lin_regr.html')\n",
    "ttr_lin_regr"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Let's get some statistics on the calculated linear regression line."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "import pprint\n",
    "pp = pprint.PrettyPrinter(indent=4)\n",
    "\n",
    "## Linear regression of the type-to-token ratio (y) on the number of tokens (x), one point per book\n",
    "x = list(df_counts['Tokens no punct'])\n",
    "y = list(df_counts['Type-to-token ratio'])\n",
    "\n",
    "## Fit once; the same result is unpacked and printed\n",
    "linregress_result = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = linregress_result\n",
    "print(linregress_result)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**Conclusion**: There is a clear correlation between TTR and the number of tokens..."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "\"A reliable index of lexical diversity (LD) has remained stubbornly elusive for over 60 years” -- McCarthy & Jarvis 2007\n",
    "\n",
    "See: https://journals.sagepub.com/doi/10.1177/0265532207080767\n",
    "\n",
    "\"The main problem is that some measures of LD do not take account fully of differences in text length. In other words, if they are used on texts of different lengths they can give misleading results. (For example, the traditional measure of type/token ratio (TTR) is susceptible to this problem.)\"\n",
    "\n",
    "-- https://textinspector.com/help/lexical-diversity/\n",
    "\n",
    "\"The simplest measure of lexical diversity is the type-token ratio (TTR, Johnson, 1939). [...] However, the rate at which new word types appear in a text decreases as the size of the text increases, and in consequence TTR’s are not comparable unless they are based on texts of the same length.\"\n",
    "\n",
    "-- Wachal & Spreen (http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.842.6984&rep=rep1&type=pdf)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "**More reliable measures** to gain insight into lexical diversity are: Guiraud's index (root TTR), MTLD, MSTTR, HDD, etc."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "See: https://core.ac.uk/download/pdf/82620241.pdf"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## https://github.com/kristopherkyle/lexical_diversity\n",
    "## `%pip` (instead of `!pip`) installs into the environment of the running kernel\n",
    "%pip install lexical-diversity\n",
    "from lexical_diversity import lex_div as ld"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 2.1. Calculate lexical diversities for **entire** texts"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## One record (dict) per book with the lexical-diversity measures enabled below\n",
    "df_lex_div = []\n",
    "\n",
    "for text, hp, chron, title in zip(texts_uk, hps, chronology, titles):\n",
    "    print(f'Tokenizing {title}...')\n",
    "\n",
    "    ## Read the text and close the file again before the heavy lifting starts\n",
    "    ## NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm\n",
    "    with open(f'{corpus_folder}/{text}.txt', 'r', encoding='utf-8') as file:\n",
    "        raw_text = file.read()\n",
    "\n",
    "    ## Only the token texts are needed here, so run the tokenizer alone (`nlp.make_doc`)\n",
    "    ## rather than the full pipeline; tokenization happens before the other components run\n",
    "    doc = nlp.make_doc(raw_text)\n",
    "    tokens_no_punct = [token.text for token in doc if token.text.isalpha()]\n",
    "    lowered_tokens_no_punct = [t.lower() for t in tokens_no_punct]\n",
    "\n",
    "    ## Uncomment the ones you want to calculate\n",
    "\n",
    "    #print('\\t ...Calculating TTR')\n",
    "    #ttr = ld.ttr(lowered_tokens_no_punct) # We've already calculated TTR above (see: df_counts); now we're using the lexical-diversity package\n",
    "\n",
    "    #print('\\t ...Calculating Giraud')\n",
    "    #giraud = ld.root_ttr(lowered_tokens_no_punct) # Number of types / square root of the number of tokens (Giraud, 1960)\n",
    "\n",
    "    #print('\\t ...Calculating MTLD')\n",
    "    #mtld = ld.mtld(lowered_tokens_no_punct)\n",
    "\n",
    "    #print('\\t ...Calculating MSTTR') # Mean Segmental Type-Token Ratio\n",
    "    #msttr = ld.msttr(lowered_tokens_no_punct,window_length=10000)\n",
    "\n",
    "    #print('\\t ...Calculating Maas TTR') # Maas\n",
    "    #maas = ld.maas_ttr(lowered_tokens_no_punct)\n",
    "\n",
    "    #print('\\t ...Calculating Log TTR') # Log\n",
    "    #log = ld.log_ttr(lowered_tokens_no_punct)\n",
    "\n",
    "    print('\\t ...Calculating MATTR') # Moving-Average Type-Token Ratio (Covington & McFall, 2010)\n",
    "    mattr = ld.mattr(lowered_tokens_no_punct,window_length=10000)\n",
    "\n",
    "    #print('\\t ...Calculating HDD') # Hypergeometric distribution D (McCarthy and Jarvis (2007, 2010))\n",
    "    #hdd = ld.hdd(lowered_tokens_no_punct) \n",
    "\n",
    "    #print('\\t ...Calculating MTLD ma. wrap') # MTLD moving average, wrap; takes a couple of hours to calculate on a normal CPU!\n",
    "    #mtld_ma_wrap = ld.mtld_ma_wrap(lowered_tokens_no_punct)\n",
    "\n",
    "    #print('\\t ...Calculating MTLD ma. bid.') # Measure of lexical textual diversity (moving average, bi-directional)\n",
    "    #mtld_ma_bid = ld.mtld_ma_bid(lowered_tokens_no_punct)\n",
    "\n",
    "    df_lex_div.append({'File': text,\n",
    "                       'Title': title,\n",
    "                       'HP': hp,\n",
    "                       'Chronology': chron,\n",
    "                       'Tokens no punct': len(tokens_no_punct),\n",
    "                       #'Type-to-token ratio': ttr, \n",
    "                       #'Giraud': giraud,\n",
    "                       #'MTLD': mtld, \n",
    "                       #'MSTTR': msttr,\n",
    "                       'MATTR': mattr,\n",
    "                       #'HDD': hdd,\n",
    "                       #'MAAS': maas,\n",
    "                       #'LOG': log,\n",
    "                       #'MTLD, ma. wrap': mtld_ma_wrap,\n",
    "                       #'MTLD ma. bid.': mtld_ma_bid\n",
    "                      })\n",
    "\n",
    "df_lex_div = pd.DataFrame(df_lex_div)\n",
    "df_lex_div"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Linear regression of MATTR (y) on the position of the book in the series (x)\n",
    "x = list(df_lex_div[\"Chronology\"])\n",
    "y = list(df_lex_div[\"MATTR\"])\n",
    "\n",
    "## Fit once; the same result is unpacked and printed\n",
    "linregress_result = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = linregress_result\n",
    "print(linregress_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import altair as alt\n",
    "import numpy as np\n",
    "\n",
    "## One bar chart per lexical-diversity measure, all derived from `base`.\n",
    "## NOTE(review): with the default settings of the cell that builds df_lex_div only the 'MATTR'\n",
    "## column exists; the other charts presumably need their measure to be uncommented there\n",
    "## before they can be rendered -- confirm\n",
    "base = alt.Chart(df_lex_div).properties(width=520, height=300)\n",
    "\n",
    "line_ttr = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('Type-to-token ratio', sort=[\"HP\"], title='Type-to-token ratio', bin=True))\n",
    "\n",
    "line_giraud = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('Giraud', sort=[\"HP\"], title='Giraud', bin=True))\n",
    "\n",
    "line_mtld = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('MTLD', sort=[\"HP\"], title='MTLD', bin=True))\n",
    "\n",
    "line_hdd = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('HDD', sort=[\"HP\"], title='HDD', bin=True))\n",
    "\n",
    "line_msttr = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('MSTTR', sort=[\"HP\"], title='MSTTR', bin=True))\n",
    "\n",
    "line_mattr = base.mark_bar().encode(\n",
    "    alt.X('HP:N', sort=['HP1', 'HP2'], title=None),\n",
    "    alt.Y('MATTR:Q', sort=['HP'], title='MATTR'))# scale=alt.Scale(domain=(0.15, 0.24))))\n",
    "\n",
    "line_maas = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('MAAS', sort=[\"HP\"], title='MAAS', bin=True))\n",
    "\n",
    "line_log = base.mark_bar(point=True).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('LOG', sort=[\"HP\"], title='LOG', bin=True))\n",
    "\n",
    "## Display the MATTR chart only; the configured copy is shown but not assigned to a name\n",
    "line_mattr.configure_axisX(labelAngle=15)\n",
    "#alt.vconcat(line_ttr, line_giraud, line_mtld, line_hdd, line_msttr, line_mattr, line_maas, line_log).configure_axisX(labelAngle=0).configure_axisX(labelAngle=-25)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Add mean and show grid lines\n",
    "\n",
    "## Horizontal red rule at the mean MATTR over all rows of df_lex_div\n",
    "mean = alt.Chart(df_lex_div).mark_rule(color='red').encode(\n",
    "    y='mean(MATTR):Q'\n",
    ")\n",
    "\n",
    "## Layer the rule over the MATTR bar chart (`line_mattr`)\n",
    "line_mattr_mean = (line_mattr + mean).properties(width=520)\n",
    "line_mattr_mean = line_mattr_mean.configure_axisX(grid=True, labelAngle=12)\n",
    "#line_mattr_mean.save('gfx/line_mattr_10000.png', dpi=500)\n",
    "line_mattr_mean\n",
    "#line_mattr_mean.save('../../output/lexicaldiversity/mattr.png')\n",
    "\n",
    "#df_lex_div['MATTR'].mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 3. Subordinating clauses"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "import nltk\n",
    "import benepar\n",
    "\n",
    "from benepar.spacy_plugin import BeneparComponent\n",
    "\n",
    "## Fresh pipeline with the Berkeley Neural Parser added for constituency parses\n",
    "nlp = spacy.load('en_core_web_sm')\n",
    "nlp.max_length = 13000000\n",
    "nlp.add_pipe(BeneparComponent('benepar_en2'))\n",
    "\n",
    "## Opening bracket of an SBAR constituent in a bracketed parse string. The look-ahead keeps\n",
    "## SBARQ nodes (direct wh-questions) out of the count; a plain substring count of \"SBAR\"\n",
    "## would include them.\n",
    "SBAR_NODE = re.compile(r'\\(SBAR(?![A-Z])')\n",
    "\n",
    "\n",
    "def percentage_of_sentences(sbar_counts, condition):\n",
    "    \"\"\"Return the percentage of entries in `sbar_counts` for which `condition` holds.\"\"\"\n",
    "    matching = sum(1 for count in sbar_counts if condition(count))\n",
    "    return (matching / len(sbar_counts)) * 100\n",
    "\n",
    "\n",
    "df_sub = []\n",
    "\n",
    "for text, hp, chron, title in zip(texts_uk, hps, chronology, titles):\n",
    "    print(f'Tokenizing {title}...')\n",
    "\n",
    "    ## Read text and close the file again before parsing starts\n",
    "    ## NOTE(review): assumes the corpus files are UTF-8 encoded -- confirm\n",
    "    with open(f'{corpus_folder}/{text}.txt', 'r', encoding='utf-8') as file:\n",
    "        raw_text = file.read()\n",
    "\n",
    "    print('\\t Converting to nlp...')\n",
    "    ## Convert to SpaCy nlp-object\n",
    "    doc = nlp(raw_text)\n",
    "\n",
    "    sentences = list(doc.sents)\n",
    "    print('\\t Number of sentences:', len(sentences))\n",
    "\n",
    "    ## Number of SBAR nodes in the constituency parse of every sentence\n",
    "    sbar_counts = [len(SBAR_NODE.findall(sentence._.parse_string)) for sentence in sentences]\n",
    "\n",
    "    df_sub.append({'File': text,\n",
    "                   'Title': title,\n",
    "                   'HP': hp,\n",
    "                   'Chronology': chron,\n",
    "                   ## Share of sentences containing an SBAR. This used to be the total number of\n",
    "                   ## SBAR nodes per 100 sentences, which does not match the column name.\n",
    "                   'Sentences with at least one subordinate clause': percentage_of_sentences(sbar_counts, lambda count: count >= 1),\n",
    "                   'Sentences with two or more subordinate clauses': percentage_of_sentences(sbar_counts, lambda count: count > 1),\n",
    "                   'Zero': percentage_of_sentences(sbar_counts, lambda count: count == 0),\n",
    "                   'One': percentage_of_sentences(sbar_counts, lambda count: count == 1),\n",
    "                   'Two': percentage_of_sentences(sbar_counts, lambda count: count == 2),\n",
    "                   'Three': percentage_of_sentences(sbar_counts, lambda count: count == 3),\n",
    "                   'More than three': percentage_of_sentences(sbar_counts, lambda count: count > 3)})\n",
    "\n",
    "df_sub = pd.DataFrame(df_sub)\n",
    "df_sub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## \"Melt\" the two summary columns ('Sentences with at least one subordinate clause' and\n",
    "## 'Sentences with two or more subordinate clauses') into long format, i.e. one row per book\n",
    "## and column, so that both can be drawn in one graph; all other columns are kept as id columns\n",
    "\n",
    "df_sub_melted = df_sub.melt(id_vars=[\"Chronology\", \"HP\", \"File\", \"Title\", \"Zero\", \"One\", \"Two\", \"Three\", \"More than three\"], \n",
    "        var_name=\"Number of subordinate clauses\", \n",
    "        value_name=\"Ratio\")\n",
    "df_sub_melted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "import seaborn as sns; sns.set(color_codes=True)\n",
    "sns.set(style=\"whitegrid\")\n",
    "\n",
    "from scipy import stats\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "plt.figure(figsize=(14, 8))\n",
    "\n",
    "ax = sns.barplot(x='HP', y='Ratio', hue='Number of subordinate clauses', data=df_sub_melted)\n",
    "ax.legend().set_title(\"\")\n",
    "\n",
    "#new = new.set(xlabel=\"\", ylabel=\"\")\n",
    "\n",
    "## The bars sit at the categorical positions 0..n-1, whereas 'Chronology' counts from 1.\n",
    "## Shift the regression x-values so that the trend lines are drawn over their own bars\n",
    "## instead of one category to the right.\n",
    "df_sub_trend = df_sub.assign(**{'Bar position': df_sub['Chronology'] - 1})\n",
    "sns.regplot(x=\"Bar position\", y=\"Sentences with at least one subordinate clause\", data=df_sub_trend, scatter=False, color='lightsteelblue', ax=ax)\n",
    "sns.regplot(x=\"Bar position\", y=\"Sentences with two or more subordinate clauses\", data=df_sub_trend, scatter=False, color='orange', ax=ax)\n",
    "ax.set(xlabel=None, ylabel=\"Percentage of sentences\")\n",
    "plt.legend(loc='upper left')\n",
    "\n",
    "## savefig does not create missing folders\n",
    "os.makedirs('gfx', exist_ok=True)\n",
    "plt.savefig('gfx/full_chart_sub_clauses.png', dpi=500)\n",
    "\n",
    "#np.std(df_sub_only_hp[\"Sentences with at least one subordinate clause\"])\n",
    "\n",
    "pp = pprint.PrettyPrinter(indent=4)\n",
    "\n",
    "x = list(df_sub[\"Sentences with two or more subordinate clauses\"])\n",
    "y = list(df_sub[\"Chronology\"])\n",
    "\n",
    "## Fit once; the same result is unpacked and printed\n",
    "linregress_result = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = linregress_result\n",
    "print(linregress_result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Build graph: bars of 'Ratio' per book ('HP'), coloured by 'Number of subordinate clauses'\n",
    "import altair as alt\n",
    "\n",
    "sub_clauses = alt.Chart(df_sub_melted).mark_bar(point=True, strokeWidth=2.5).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('Ratio', title='Sentence ratio', scale=alt.Scale(zero=False)),\n",
    "    alt.Color('Number of subordinate clauses', title=None)\n",
    ").properties(width=570, height=350).configure_axisX(labelAngle=0)\n",
    "\n",
    "## Style the legend and place it above the chart (orient=\"top\")\n",
    "sub_clauses = sub_clauses.configure_legend(\n",
    "    strokeColor='gray',\n",
    "    orient=\"top\",\n",
    "    titleAlign='center',\n",
    "    #fillColor='#EEEEEE',\n",
    "    padding=10,\n",
    "    cornerRadius=10,\n",
    "    labelLimit= 0)#.configure_axis(labelFontSize = 11)\n",
    "\n",
    "## Add grid lines\n",
    "full_chart_sub_clauses_1 = sub_clauses.configure_axis(grid=True)\n",
    "\n",
    "## Save chart \n",
    "## NOTE(review): presumably requires an existing 'gfx' folder and an installed SVG export backend -- confirm\n",
    "full_chart_sub_clauses_1.save('gfx/full_chart_sub_clauses.svg', dpi=500)\n",
    "full_chart_sub_clauses_1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## \"Melt\" the per-count columns ('One', 'Two', 'Three', 'More than three') into long format,\n",
    "## i.e. one row per book and column, so that all of them can be drawn in one graph.\n",
    "## 'Chronology' has to be listed in id_vars as well: otherwise it is melted too and shows up\n",
    "## as a bogus 'Number of subordinate clauses' category with the values 1, 2, 3, ...\n",
    "\n",
    "df_sub_melted = df_sub.melt(id_vars=[\"Chronology\", \"HP\", \"File\", \"Title\", \"Zero\", \"Sentences with at least one subordinate clause\", \"Sentences with two or more subordinate clauses\"], \n",
    "        var_name=\"Number of subordinate clauses\", \n",
    "        value_name=\"Ratio\")\n",
    "\n",
    "df_sub_melted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Build graph: one line per 'Number of subordinate clauses' category, 'Ratio' per book ('HP')\n",
    "sub_clauses = alt.Chart(df_sub_melted).mark_line(point=True, strokeWidth=2.5).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('Ratio', title='Sentence ratio', scale=alt.Scale(zero=False)),\n",
    "    alt.Color('Number of subordinate clauses', title=None)\n",
    ").properties(width=570, height=350).configure_axisX(labelAngle=0)\n",
    "\n",
    "## Style the legend and place it above the chart (orient=\"top\")\n",
    "sub_clauses = sub_clauses.configure_legend(\n",
    "    strokeColor='gray',\n",
    "    orient=\"top\",\n",
    "    #fillColor='#EEEEEE',\n",
    "    padding=10,\n",
    "    cornerRadius=10).configure_axis(labelFontSize = 11)\n",
    "\n",
    "## Add grid lines. Every `configure_axis` call replaces the whole axis configuration,\n",
    "## so the label font size has to be passed again or it is dropped.\n",
    "full_chart_sub_clauses = sub_clauses.configure_axis(grid=True, labelFontSize=11)\n",
    "\n",
    "## Save chart \n",
    "#full_chart_sub_clauses.save('../../output/lexicaldiversity/full_chart_sub_clauses.html')\n",
    "#full_chart_sub_clauses.save('../../output/lexicaldiversity/full_chart_sub_clauses.svg')\n",
    "full_chart_sub_clauses"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 4. Readability"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.1. For **entire** texts"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Score the readability of each text with popular metrics such as Flesch-Kincaid, Gunning fog, ARI, Dale-Chall, SMOG, etc."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## https://pypi.org/project/readability/\n",
    "\n",
    "!pip install https://github.com/andreasvc/readability/tarball/master\n",
    "import readability\n",
    "\n",
    "## Output column -> key in results['readability grades']\n",
    "READABILITY_GRADE_KEYS = {\n",
    "    'Flesch Reading Ease': 'FleschReadingEase',\n",
    "    'Flesch-Kincaid': 'Kincaid',\n",
    "    'ARI': 'ARI',\n",
    "    'Coleman-Liau': 'Coleman-Liau',\n",
    "    'Gunning fog': 'GunningFogIndex',\n",
    "    'LIX': 'LIX',\n",
    "    'SMOG': 'SMOGIndex',\n",
    "    'RIX': 'RIX',\n",
    "    'Dale-Chall': 'DaleChallIndex',\n",
    "}\n",
    "\n",
    "## Output column -> key in results['sentence beginnings']\n",
    "SENTENCE_BEGINNING_KEYS = {\n",
    "    'Pronoun': 'pronoun',\n",
    "    'Interrogative': 'interrogative',\n",
    "    'Article': 'article',\n",
    "    'Subordination': 'subordination',\n",
    "    'Conjunction': 'conjunction',\n",
    "    'Preposition': 'preposition',\n",
    "}\n",
    "\n",
    "## One record (dict) per book; later cells turn these lists into DataFrames\n",
    "df_readability = [] \n",
    "df_sentence_beginnings = []\n",
    "\n",
    "for text, hp, chron, title in zip(texts_uk, hps, chronology, titles):\n",
    "    print(f'Getting readability scores for {title}...')\n",
    "\n",
    "    # Read text (the file is closed again before the long-running NLP step)\n",
    "    with open(f'{corpus_folder}/{text}.txt','r') as file:\n",
    "        raw_text = file.read()\n",
    "\n",
    "    # Convert to SpaCy nlp-object\n",
    "    doc = nlp(raw_text)\n",
    "\n",
    "    print('\\t Tokenizing sentences')\n",
    "    # Span.text instead of the deprecated Span.string (removed in spaCy 3);\n",
    "    # after strip() both give the same sentence text\n",
    "    sentences = [sent.text.strip() for sent in doc.sents]\n",
    "\n",
    "    results = readability.getmeasures(sentences, lang='en')\n",
    "\n",
    "    ## Readability scores\n",
    "\n",
    "    print('\\t Calculating readability scores')\n",
    "    grades = results['readability grades']\n",
    "    readability_row = {'File': text, 'Title': title, 'HP': hp, 'Chronology': chron}\n",
    "    for column, key in READABILITY_GRADE_KEYS.items():\n",
    "        readability_row[column] = grades[key]\n",
    "    df_readability.append(readability_row)\n",
    "\n",
    "    ## Sentence beginnings information: each count is divided by the number of\n",
    "    ## sentences reported by readability\n",
    "\n",
    "    n_sentences = results['sentence info']['sentences']\n",
    "    beginnings = results['sentence beginnings']\n",
    "    beginnings_row = {'File': text, 'Title': title, 'HP': hp}\n",
    "    for column, key in SENTENCE_BEGINNING_KEYS.items():\n",
    "        beginnings_row[column] = beginnings[key] / n_sentences\n",
    "    df_sentence_beginnings.append(beginnings_row)\n",
    "        "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Turn the collected per-book records into a table\n",
    "df_readability = pd.DataFrame(data=df_readability)\n",
    "df_readability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import altair as alt\n",
    "\n",
    "base = alt.Chart(df_readability).properties(width=500, height=150)\n",
    "\n",
    "\n",
    "def readability_line(column, axis_title):\n",
    "    \"\"\"Return a line chart (with point markers) of `column` against 'HP', built on `base`.\"\"\"\n",
    "    x_axis = alt.X('HP', title=None)\n",
    "    y_axis = alt.Y(column, sort=[\"HP\"], title=axis_title, scale=alt.Scale(zero=False))\n",
    "    return base.mark_line(point=True).encode(x_axis, y_axis)\n",
    "\n",
    "\n",
    "# One small chart per readability measure\n",
    "line_f = readability_line('Flesch Reading Ease', 'Flesch')\n",
    "line_fk = readability_line('Flesch-Kincaid', 'Flesch-Kincaid')\n",
    "line_dc = readability_line('Dale-Chall', 'Dale-Chall')\n",
    "line_ari = readability_line('ARI', 'ARI')\n",
    "line_smog = readability_line('SMOG', 'SMOG')\n",
    "line_cl = readability_line('Coleman-Liau', 'Coleman-Liau')\n",
    "line_gf = readability_line('Gunning fog', 'Gunning fog')\n",
    "\n",
    "# Stack the charts vertically\n",
    "alt.vconcat(line_f, line_fk, line_dc, line_ari, line_smog, line_cl, line_gf).configure_axisX(labelAngle=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Reshape df_readability to long format so that several readability scores can share one graph.\n",
    "# RIX, LIX and Flesch Reading Ease are listed as identifier columns (i.e. left out of the\n",
    "# melted scores) for now, because they use different ranges.\n",
    "\n",
    "readability_id_columns = [\"HP\", \"File\", 'RIX', 'LIX', 'Flesch Reading Ease', 'Title', 'Chronology']\n",
    "\n",
    "df_readability_melted = pd.melt(\n",
    "    df_readability,\n",
    "    id_vars=readability_id_columns,\n",
    "    var_name=\"readability\",\n",
    "    value_name=\"score\",\n",
    ")\n",
    "\n",
    "df_readability_melted.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build graph: one line per melted readability measure\n",
    "full_chart_readability = alt.Chart(df_readability_melted).mark_line(point=True, strokeWidth=2.5).encode(\n",
    "    alt.X('HP', title=None),\n",
    "    alt.Y('score', title='Score', scale=alt.Scale(zero=False)),\n",
    "    alt.Color('readability', title=None)\n",
    ").properties(width=700, height=400).configure_axisX(labelAngle=0)\n",
    "\n",
    "# Put a framed legend above the plot\n",
    "full_chart_readability = full_chart_readability.configure_legend(\n",
    "    strokeColor='gray',\n",
    "    #fillColor='#EEEEEE',\n",
    "    orient=\"top\",\n",
    "    padding=10,\n",
    "    cornerRadius=10)\n",
    "\n",
    "# Axis label size and grid lines are set in ONE call: every configure_axis() call\n",
    "# replaces the chart's whole axis configuration, so two separate calls would\n",
    "# silently drop the label font size.\n",
    "full_chart_readability = full_chart_readability.configure_axis(labelFontSize=11, grid=True)\n",
    "\n",
    "# Save chart (uncomment to export)\n",
    "#full_chart_readability.save('../../output/lexicaldiversity/readability.html')\n",
    "#full_chart_readability.save('../../output/lexicaldiversity/readability.svg')\n",
    "#full_chart_readability.save('../../output/lexicaldiversity/readability.png')\n",
    "full_chart_readability"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Turn the collected per-book sentence-beginning ratios into a table\n",
    "df_sentence_beginnings = pd.DataFrame(data=df_sentence_beginnings)\n",
    "df_sentence_beginnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### 4.2. Readability scores for **samples**"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from random import shuffle\n",
    "import random\n",
    "\n",
    "## Output column -> key in results['readability grades'].\n",
    "## Flesch Reading Ease, LIX and RIX are not sampled here.\n",
    "SAMPLE_GRADE_KEYS = {\n",
    "    'Flesch-Kincaid': 'Kincaid',\n",
    "    'ARI': 'ARI',\n",
    "    'Coleman-Liau': 'Coleman-Liau',\n",
    "    'Gunning fog': 'GunningFogIndex',\n",
    "    'SMOG': 'SMOGIndex',\n",
    "    'Dale-Chall': 'DaleChallIndex',\n",
    "}\n",
    "\n",
    "SENTENCES_PER_SAMPLE = 200  # number of sentences per sample (sample size)\n",
    "SAMPLES_PER_BOOK = 25       # number of samples drawn at random from each book\n",
    "RANDOM_SEED = 42            # fixed seed so that re-running this cell draws the same samples\n",
    "\n",
    "## Dedicated generator: reproducible without touching the global `random` state\n",
    "sampler = random.Random(RANDOM_SEED)\n",
    "\n",
    "df_readability_samples = []\n",
    "\n",
    "df_readability_samples_all_grades = []\n",
    "\n",
    "for chron, text, hp, title in zip(chronology, texts_uk, hps, titles):\n",
    "    print(f'Getting readability scores for {title}...')\n",
    "\n",
    "    ## Read text (the file is closed again before the long-running NLP step)\n",
    "    with open(f'{corpus_folder}/{text}.txt','r') as file:\n",
    "        raw_text = file.read()\n",
    "\n",
    "    doc = nlp(raw_text)\n",
    "    # Span.text instead of the deprecated Span.string (removed in spaCy 3);\n",
    "    # after strip() both give the same sentence text\n",
    "    sentences = [sent.text.strip() for sent in doc.sents]\n",
    "    #sampler.shuffle(sentences) ---> uncomment if you want to shuffle the sentences!\n",
    "\n",
    "    n = SENTENCES_PER_SAMPLE\n",
    "\n",
    "    print(f'\\tBuilding samples of {n} sentences...')\n",
    "\n",
    "    ## Consecutive chunks of n sentences (the last chunk may be shorter)\n",
    "    sent_samples = [sentences[i:i + n] for i in range(0, len(sentences), n)]\n",
    "\n",
    "    ## sample() raises ValueError when asked for more items than there are,\n",
    "    ## so never request more chunks than the book provides\n",
    "    random_sent_sample = sampler.sample(sent_samples, min(SAMPLES_PER_BOOK, len(sent_samples)))\n",
    "\n",
    "    print('\\tNumber of samples:', len(random_sent_sample))\n",
    "\n",
    "    ## Scores of this book: one list per measure, plus all measures pooled\n",
    "    grades = {column: [] for column in SAMPLE_GRADE_KEYS}\n",
    "    all_grades_combined = []\n",
    "\n",
    "    for sample in random_sent_sample:\n",
    "\n",
    "        results = readability.getmeasures(sample, lang='en')\n",
    "\n",
    "        for column, key in SAMPLE_GRADE_KEYS.items():\n",
    "            score = results['readability grades'][key]\n",
    "            grades[column].append(score)\n",
    "            all_grades_combined.append(score)\n",
    "\n",
    "    ## Append ONE record per book, after all its samples are scored. Appending inside\n",
    "    ## the sample loop would add one row per sample that all share the same lists, so\n",
    "    ## explode() would repeat every score once per sample and inflate the number of\n",
    "    ## observations used by the statistics computed further down.\n",
    "    book = {'File': text, 'Title': title, 'HP': hp, 'Chronology': chron}\n",
    "    df_readability_samples.append({**book, **grades})\n",
    "    df_readability_samples_all_grades.append({**book, 'All grades combined': all_grades_combined})\n",
    "\n",
    "    for column, scores in grades.items():\n",
    "        print(f'\\tAverage sample {column} grade:', sum(scores)/len(scores))\n",
    "\n",
    "    print('\\tAverage sample all grades combined:', sum(all_grades_combined)/len(all_grades_combined))\n",
    "\n",
    "df_readability_samples = pd.DataFrame(df_readability_samples)\n",
    "\n",
    "df_readability_all = pd.DataFrame(df_readability_samples_all_grades)\n",
    "df_readability_all"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## The list-valued score columns of df_readability_samples\n",
    "score_columns = ['SMOG', 'Gunning fog', 'Dale-Chall', 'Coleman-Liau', 'Flesch-Kincaid', 'ARI']\n",
    "\n",
    "## One row per individual score instead of one list per cell\n",
    "df_readability_samples_exploded = explode(df_readability_samples, lst_cols=score_columns)\n",
    "\n",
    "## Long format: the measure name goes into 'Readability scores', the score into the default 'value' column\n",
    "df_readability_samples_exploded_melted = df_readability_samples_exploded.melt(\n",
    "    id_vars=['File', 'Title', 'HP', 'Chronology'],\n",
    "    value_vars=score_columns,\n",
    "    var_name='Readability scores',\n",
    ")\n",
    "\n",
    "df_readability_samples_exploded_melted"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set(style=\"whitegrid\", color_codes=True)\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "## Explicit figure/axes pair instead of the implicit pyplot state\n",
    "fig, ax = plt.subplots(figsize=(16, 10))\n",
    "ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])\n",
    "\n",
    "## One box per book (x) and readability measure (hue); outliers hidden, mean drawn as a dashed white line\n",
    "mean_line_style = {\"linestyle\":\"--\", \"linewidth\":2, \"color\":'white'}\n",
    "chart = sns.boxplot(\n",
    "    x='HP',\n",
    "    y='value',\n",
    "    data=df_readability_samples_exploded_melted,\n",
    "    hue='Readability scores',\n",
    "    showfliers=False,\n",
    "    showmeans=True,\n",
    "    meanline=True,\n",
    "    meanprops=mean_line_style,\n",
    "    ax=ax,\n",
    ")\n",
    "ax.legend(loc='upper left', ncol=3)\n",
    "\n",
    "ax.set_xticklabels(ax.get_xticklabels(), rotation=0)\n",
    "ax.set(xlabel=None, ylabel='Grade level')\n",
    "fig.savefig(\"gfx/boxplot_readability_scores.pdf\", dpi=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## One row per individual score of the pooled grade levels\n",
    "df_readability_samples_exploded_all = explode(df_readability_all, lst_cols=['All grades combined'])\n",
    "\n",
    "## Long format: scores end up in the default 'value' column\n",
    "df_readability_samples_exploded_melted_all = df_readability_samples_exploded_all.melt(\n",
    "    id_vars=['File', 'Title', 'HP', 'Chronology'],\n",
    "    value_vars=['All grades combined'],\n",
    "    var_name='Average grade level',\n",
    ")\n",
    "\n",
    "fig, ax = plt.subplots(figsize=(16, 7))\n",
    "ax.set_yticks([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])\n",
    "\n",
    "## One box per book; outliers hidden, mean drawn as a dashed white line\n",
    "mean_line_style = {\"linestyle\":\"--\", \"linewidth\":2, \"color\":'white'}\n",
    "chart = sns.boxplot(\n",
    "    x='HP',\n",
    "    y='value',\n",
    "    data=df_readability_samples_exploded_melted_all,\n",
    "    hue='Average grade level',\n",
    "    showfliers=False,\n",
    "    showmeans=True,\n",
    "    meanline=True,\n",
    "    meanprops=mean_line_style,\n",
    "    ax=ax,\n",
    ")\n",
    "\n",
    "ax.set_xticklabels(ax.get_xticklabels(), rotation=0)\n",
    "ax.set(xlabel=None, ylabel='Grade level')\n",
    "## Replaces the legend that seaborn generated from `hue`\n",
    "ax.legend(['Readability scores\\' grade levels combined'])\n",
    "fig.savefig(\"gfx/boxplot_readability_scores_all.pdf\", dpi=500)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from scipy import stats\n",
    "import pprint\n",
    "pp = pprint.PrettyPrinter(indent=4)\n",
    "\n",
    "## Relationship between chronology (x) and one specific readability measure (y)\n",
    "\n",
    "x = df_readability_samples_exploded['Chronology'].tolist()\n",
    "y = df_readability_samples_exploded['SMOG'].tolist()\n",
    "\n",
    "## Fit the linear regression once; keep the individual statistics and print the full result\n",
    "regression = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = regression\n",
    "print(regression)\n",
    "\n",
    "## Rank correlation of the same two variables\n",
    "result = stats.kendalltau(x, y)\n",
    "print('Kendall Tau:', result)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Get correlation between chronology and all readability measure values\n",
    "\n",
    "## Chronology is the independent variable (x) and the grade level the dependent one (y),\n",
    "## matching the regression on a single measure above. r and p are the same either way,\n",
    "## but slope and intercept are only comparable when the axes agree.\n",
    "x = list(df_readability_samples_exploded_melted_all['Chronology'])\n",
    "y = list(df_readability_samples_exploded_melted_all['value'])\n",
    "\n",
    "## Fit once; keep the individual statistics and print the full result\n",
    "regression = stats.linregress(x, y)\n",
    "gradient, intercept, r_value, p_value, std_err = regression\n",
    "print(regression)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Peek at the first rows of the exploded sample scores\n",
    "df_readability_samples_exploded.head(n=5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Kendall rank correlation between every pair of numeric columns.\n",
    "## Non-numeric columns are excluded explicitly: pandas < 2.0 skipped them silently,\n",
    "## while pandas >= 2.0 raises when corr() meets a string column.\n",
    "numeric_scores = df_readability_samples_exploded.select_dtypes(include='number')\n",
    "correlations = numeric_scores.corr(method='kendall')\n",
    "correlations"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "## Correlation matrix without the 'Chronology' row and column\n",
    "dropped = correlations.drop(index='Chronology', columns='Chronology')\n",
    "dropped"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3.6",
   "language": "python",
   "name": "py36"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
